# General-purpose data wrangling
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Graphing
library(ggplot2)
# Date manipulation
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.5
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Scales manipulation
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
break_log_cust <- function(range) {
breaks_custom <- c()
for (x in range) {
for (i in 1:9) {
breaks_custom <- c(breaks_custom, i*10^x)
}
}
return(breaks_custom)
}
datasets <- read_rds("../data/datasets/epsrc_grant_datasets.rds")
datasets
## $person_roles
## # A tibble: 73,218 x 4
## person_role_id grant_ref role person_id
## <chr> <fct> <fct> <fct>
## 1 578F78BD-E807-4C27-B638-DF6A80C58264 EP/G026440/1 PRINCIPAL~ 390D96F7-5924-4~
## 2 F4682871-ED94-443E-B640-46F573504687 EP/I006591/1 PRINCIPAL~ 0A4952F9-DDA2-4~
## 3 3BC8723D-5584-4657-ABC4-E61F0DBEDAFA EP/F032145/1 PRINCIPAL~ AF612336-CBA1-4~
## 4 540DABDB-9231-4101-9E7C-28128D884EA5 EP/F032145/1 CO_INVEST~ 690F821C-1EDD-4~
## 5 4DA004EF-8282-43D6-B5DE-7AEE47E3FF23 EP/K012258/1 PRINCIPAL~ 1DB55081-3907-4~
## 6 AD7D0F55-3621-46BF-8B5A-C6FF630991C1 EP/K012258/1 CO_INVEST~ 6024802D-8803-4~
## 7 C3D97C2E-0EE6-497B-B5AD-6430FD74E662 EP/K012258/1 CO_INVEST~ 8E16681F-5AB4-4~
## 8 406F342D-8747-4542-97F3-25551E437A0F EP/K012258/1 CO_INVEST~ 65FE98F7-4153-4~
## 9 0F2B8601-42C9-4335-B91C-D063F59FE7F3 EP/S513817/1 TRAINING_~ 6D1727F8-71D0-4~
## 10 9FDF33D0-8310-4945-9274-09AECC73EAE1 EP/H023909/1 CO_INVEST~ A3D04976-A9D9-4~
## # ... with 73,208 more rows
##
## $people
## # A tibble: 31,261 x 4
## person_id first_name other_names surname
## <chr> <chr> <chr> <chr>
## 1 390D96F7-5924-476A-B7E3-E87694AA1FE0 tanniemola bunting liverpool
## 2 0A4952F9-DDA2-4767-833F-3E4B5BE2F878 mike <NA> gregory
## 3 AF612336-CBA1-4927-829C-883C19D94EA0 hua <NA> dong
## 4 690F821C-1EDD-45DF-AD28-85CA4A037985 robert <NA> macredie
## 5 1DB55081-3907-4397-98C1-3E6CBA25FD3E robin bruce bedford
## 6 6024802D-8803-4519-89B4-40D5B4361A2F david j fermin
## 7 8E16681F-5AB4-4AEC-B1E8-40ED46301A01 guy <NA> lloyd-jones
## 8 65FE98F7-4153-470B-A575-E03BBFA2CCDC jeremy noel harvey
## 9 6D1727F8-71D0-42BB-BDE1-4ED8BA097323 elizabeth <NA> towns-andrews
## 10 A3D04976-A9D9-4851-8D8D-26115BFE0BC6 julie <NA> macpherson
## # ... with 31,251 more rows
##
## $name_gender
## # A tibble: 6,823 x 3
## name proportion_male count
## <chr> <dbl> <dbl>
## 1 a 0.68 19908
## 2 aad 0.95 863
## 3 aaditya 1 100
## 4 aakanksha 0.0400 49
## 5 aakeen NA NA
## 6 aakifah 0 6
## 7 aaliyah 0.0100 673
## 8 aamal 0.0700 15
## 9 aamina 0.0600 32
## 10 aamna 0.0400 46
## # ... with 6,813 more rows
##
## $organisation_roles
## # A tibble: 93,571 x 4
## org_role_id grant_ref org_role org_id
## <chr> <fct> <fct> <fct>
## 1 CD22E167-9295-49C1-A774-788E7EACA160 EP/G026440/1 LEAD_RO 4A348A76-B~
## 2 97907792-9F95-421F-A9D1-8D813DB6A719 EP/G026440/1 COLLABORATOR 3EAE04CA-9~
## 3 7F9E4C86-998D-474A-88C3-84CC45779C04 EP/G026440/1 COLLABORATOR 46B41008-0~
## 4 5203030E-E220-4F5E-A0CB-C120F750D7FB EP/G026440/1 COLLABORATOR 7DD38B9D-A~
## 5 827544D6-1434-490B-B7DF-A4F6C1AB5A44 EP/G026440/1 PROJECT_PARTNER CAC1288E-6~
## 6 2FA832A7-FE28-4EBA-9865-DA0F19D7DDE4 EP/I006591/1 LEAD_RO D1774113-D~
## 7 5CB8DA76-7BEC-4253-9116-284468B8CBF2 EP/F032145/1 LEAD_RO BECA2763-0~
## 8 1B9CC61F-1154-426F-B88A-7A95494AEBD1 EP/F032145/1 PROJECT_PARTNER 87ED9DCB-0~
## 9 8CC6B9E1-B193-4D9E-85AE-C6637D1F40B4 EP/F032145/1 PROJECT_PARTNER 532931BB-1~
## 10 F160A4AF-E424-439C-AB9C-C5FFFD9FF12A EP/F032145/1 PROJECT_PARTNER 08D2B6FC-E~
## # ... with 93,561 more rows
##
## $organisations
## # A tibble: 16,395 x 12
## org_id org_name org_url address_line1 address_post_co~ address_region
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 4A348A7~ Universi~ https://gtr~ 5 Tyndall Av~ BS8 1UD South West
## 2 3EAE04C~ Universi~ https://gtr~ University C~ OX1 2JD South East
## 3 46B4100~ Durham U~ https://gtr~ Palatine Cen~ DH1 3LE Unknown
## 4 7DD38B9~ Tohoku U~ https://gtr~ <NA> <NA> <NA>
## 5 CAC1288~ Syracuse~ https://gtr~ Syracuse Uni~ NY 13244-4 Outside UK
## 6 D177411~ Universi~ https://gtr~ Lensfield Ro~ CB2 1EW Unknown
## 7 BECA276~ Brunel U~ https://gtr~ Kingston Lane UB8 3PH Unknown
## 8 87ED9DC~ Wright D~ https://gtr~ 1a Norwich S~ CB2 1ND East of Engla~
## 9 532931B~ Ricabili~ https://gtr~ Unit 10, Ble~ N7 9NY Unknown
## 10 08D2B6F~ Granta D~ https://gtr~ Rustat House CB1 7EG East of Engla~
## # ... with 16,385 more rows, and 6 more variables: address_country <chr>,
## # address_line2 <chr>, address_line4 <chr>, address_line5 <chr>,
## # address_line3 <chr>, address_city <chr>
##
## $subject_weight
## # A tibble: 25,082 x 4
## subject_weight_id grant_ref subject_id percentage
## <chr> <fct> <fct> <dbl>
## 1 D666A364-FF4E-4CEA-83D6-C85CDF5496DD EP/G026440/1 17F21DBC-4946-4~ 50
## 2 4DB01636-14E2-4E0D-8EAE-4F8971C3DFD5 EP/G026440/1 E847DD9B-2053-4~ 50
## 3 0E58470D-3A0D-42EE-AAD3-86BD4EC2E8DE EP/I006591/1 5E9AA4EC-49E3-4~ 100
## 4 ACE2E454-919D-43AA-9F88-5991B5BA1CEA EP/F032145/1 5E9AA4EC-49E3-4~ 100
## 5 5CEE216E-4132-42E3-B504-D856E5B0F21E EP/K012258/1 64596636-AE88-4~ 100
## 6 AFA8A43B-F969-4440-9EA4-C6BA3E49E607 EP/H023909/1 64596636-AE88-4~ 40
## 7 F13F0FF9-A847-4FCC-840B-AD92E5C91A86 EP/H023909/1 E847DD9B-2053-4~ 60
## 8 598878F7-41CC-4B16-B59A-9ECD57468347 EP/I004246/1 EB5F16BB-2772-4~ 100
## 9 43299E65-463F-492F-82C4-705183935851 EP/H049746/1 772CD758-53CD-4~ 100
## 10 68E2C54B-7097-4E2E-94D1-CAFD865A1A7F EP/K007777/1 F95121C7-7C31-4~ 100
## # ... with 25,072 more rows
##
## $subjects
## # A tibble: 74 x 2
## subject_id subject
## <chr> <fct>
## 1 17F21DBC-4946-4262-AD07-9914F6EA12F7 Bioengineering
## 2 E847DD9B-2053-4BC7-97A9-4213B185C968 Materials sciences
## 3 5E9AA4EC-49E3-4D6A-B545-79DA07CE39E0 Management & Business Studies
## 4 64596636-AE88-4F6E-A816-9C4C203E1197 Catalysis & surfaces
## 5 EB5F16BB-2772-4DDE-BD6C-3B7A6914B64C Info. & commun. Technol.
## 6 772CD758-53CD-407F-9B2C-F2B861E86155 Mechanical Engineering
## 7 F95121C7-7C31-40CE-BDBC-1E2BF025AEA6 Energy
## 8 1A1A6805-9DC4-4BCE-BC70-9F2AA4FD093B Ecol, biodivers. & systematics
## 9 DEA11FBC-BEED-4EDD-890B-97D728462D26 Mathematical sciences
## 10 FB535BD0-E265-4C0A-8532-32DCB83A3951 Tools, technologies & methods
## # ... with 64 more rows
##
## $topic_weight
## # A tibble: 45,439 x 4
## topic_weight_id grant_ref topic_id percentage
## <chr> <fct> <fct> <dbl>
## 1 FBB8A358-AA0E-41CE-9D63-67D1CE5DB753 EP/G026440/1 BD108981-2A72-4~ 50
## 2 AE70E5D8-B282-425B-92D3-84D0C712FC84 EP/G026440/1 561091A1-9FC9-4~ 25
## 3 D5407096-419B-4ABC-917E-5FDEA1040371 EP/G026440/1 063026C5-CD0B-4~ 25
## 4 9D8FA88E-9FEA-42EC-8D1A-C1F4300E8FCB EP/I006591/1 6A482B09-D749-4~ 100
## 5 E1965173-A691-4FEB-9382-3BAB2074839B EP/F032145/1 6A482B09-D749-4~ 100
## 6 EED7986B-06E6-4D50-A1AD-4CBBAAE122F8 EP/K012258/1 35E9CECD-F211-4~ 100
## 7 2CFEAB18-BEA8-4B28-9DAD-D9BDE6E4B2A0 EP/S513817/1 D05BC2E0-0345-4~ NA
## 8 E9F1D628-29CC-4FB8-921A-9E1F4934E2AC EP/H023909/1 561091A1-9FC9-4~ 30
## 9 1AE1739D-5F9B-48F6-91C2-0831DCBBBCF9 EP/H023909/1 063026C5-CD0B-4~ 30
## 10 2EDB060D-4FF5-4327-BD53-C22526FE1585 EP/H023909/1 0C89BB7D-DCEF-4~ 40
## # ... with 45,429 more rows
##
## $topics
## # A tibble: 343 x 2
## topic_id topic
## <chr> <fct>
## 1 BD108981-2A72-4C45-AAC2-425461EB90BE Biomaterials
## 2 561091A1-9FC9-4508-B1B2-2F3623E1FC9D Materials Characterisation
## 3 063026C5-CD0B-48AB-82F3-6359499A95D4 Materials Synthesis & Growth
## 4 6A482B09-D749-4443-8AF0-654984FCE91D Manufact. Business Strategy
## 5 35E9CECD-F211-4914-9F91-4B8FB76B773D Catalysis & Applied Catalysis
## 6 D05BC2E0-0345-4A3F-8C3F-775BC42A0819 Unclassified
## 7 0C89BB7D-DCEF-4DCF-84DB-CA53C907AB0E Surfaces & Interfaces
## 8 76783275-A9F8-4B4E-B314-51363124259C Fundamentals of Computing
## 9 0A982A4A-12CF-4734-AFCA-A5DC61F667F3 Information & Knowledge Mgmt
## 10 ED6338AE-3457-45D6-90CA-B994C3CF422B Aerodynamics
## # ... with 333 more rows
##
## $grant_themes
## # A tibble: 8,918 x 3
## grant_theme_id grant_ref theme_id
## <chr> <fct> <fct>
## 1 02bf5930-df87-41e5-86fa-7d66659591f2 EP/V028855/1 ea880e85-f413-4373-98af-42~
## 2 a90cadd6-3977-40be-a687-857635e90c48 EP/V05385X/1 ea880e85-f413-4373-98af-42~
## 3 a6b639ed-70d7-4e7a-a8ce-f980a9164060 EP/V00817X/1 ea880e85-f413-4373-98af-42~
## 4 e75a7ed0-726c-462e-a15f-3e8f2ea719e5 EP/V029975/1 ea880e85-f413-4373-98af-42~
## 5 4270f864-77a4-48c4-a2a7-518574e3b50f EP/T012595/1 ea880e85-f413-4373-98af-42~
## 6 b3add47a-4a72-484e-87e8-6758e71063f7 EP/V007785/1 ea880e85-f413-4373-98af-42~
## 7 7d2954c4-d219-4009-a33f-18f73ef08025 EP/V007750/1 ea880e85-f413-4373-98af-42~
## 8 e5f314ea-129d-44dd-b36e-f135a6123412 EP/V007742/1 ea880e85-f413-4373-98af-42~
## 9 20131c65-ad81-463f-8ccc-78c9a2c76dec EP/V012126/1 ea880e85-f413-4373-98af-42~
## 10 9be9bf53-3cac-40ba-af32-44d4504a6b18 EP/V012037/1 ea880e85-f413-4373-98af-42~
## # ... with 8,908 more rows
##
## $themes
## # A tibble: 638 x 4
## theme_id theme_type theme_name theme_research_a~
## <chr> <fct> <fct> <fct>
## 1 ea880e85-f413-4373-98af-428a6a52e465 Challenge Engineering Analytical Scien~
## 2 645516da-c627-4ea8-b057-d35168d65986 Challenge Engineering Architecture and~
## 3 6c704d32-3a2a-462e-a1ae-916fc7506c5e Challenge Engineering Artificial Intel~
## 4 41fbadd0-19b8-48de-a012-fe2a8401354d Challenge Engineering Assistive Techno~
## 5 85001c51-b6ce-4e0a-b6a4-d637852be41e Challenge Engineering Bioenergy
## 6 096cb244-b815-4adc-bc4e-c9a216d6f307 Challenge Engineering Biological Infor~
## 7 d1fe47c3-a1bf-4b34-b804-4d574d859e9b Challenge Engineering Biomaterials and~
## 8 34891f64-93be-4900-9092-5ab9f4649b36 Challenge Engineering Biophysics and S~
## 9 c7cd59fc-7e7c-45d9-a706-aeca4cb6360b Challenge Engineering Built Environment
## 10 0fbf5993-46c7-471d-a33c-a51d08e40012 Challenge Engineering Carbon Capture a~
## # ... with 628 more rows
##
## $grants
## # A tibble: 29,805 x 11
## grant_ref grant_id grant_title grant_abstract lead_ro_departm~ grant_category
## <chr> <chr> <chr> <chr> <chr> <fct>
## 1 EP/G026440/1 66B9C31~ NSF Materi~ "This proposa~ Mathematics Research Grant
## 2 EP/I006591/1 67133CB~ UK-Japan I~ "The aim of t~ Engineering Research Grant
## 3 EP/F032145/1 68679F7~ Facilitati~ "Inclusive de~ Sch of Engineer~ Research Grant
## 4 EP/K012258/1 8D1B2AE~ The Ironwo~ "The formatio~ Chemistry Research Grant
## 5 EP/S513817/1 BFED96A~ Industrial~ "Doctoral Tra~ Research and En~ Training Grant
## 6 EP/H023909/1 C18E548~ Uncovering~ "Electrochemi~ Chemistry Research Grant
## 7 EP/I004246/1 8AF8AD2~ Foundation~ "Many importa~ Dept of Computi~ Fellowship
## 8 EP/H049746/1 8B040A0~ Equipment ~ "Aerospace is~ Div of Material~ Research Grant
## 9 EP/P504546/1 407EE17~ DHPA - Kee~ "Doctoral Tra~ Research Develo~ Training Grant
## 10 EP/N509346/1 175AFA1~ Industrial~ "Doctoral Tra~ Research Finance Training Grant
## # ... with 29,795 more rows, and 5 more variables: start_date <date>,
## # end_date <date>, award_value <dbl>, region <fct>, project_open <lgl>
amount_start_date <- datasets$grants %>%
select(
start_date,
award_value
)
amount_start_year <- amount_start_date %>%
mutate(
start_year = as_factor(year(start_date))
) %>%
group_by(
start_year
) %>%
summarise(
mean_award_value = mean(award_value),
total_award_value = sum(award_value)
)
amount_start_year
amount_start_month <- amount_start_date %>%
mutate(
start_month = as_factor(month(start_date, label = TRUE, abbr = TRUE))
) %>%
group_by(
start_month
) %>%
summarise(
mean_award_value = mean(award_value),
total_award_value = sum(award_value)
)
amount_start_month
amount_start_year_month <- amount_start_date %>%
mutate(
start_year = as_factor(year(start_date)),
start_month = month(start_date, label = TRUE, abbr = TRUE),
# start_quater = as_factor(quarter(start_date))
) %>%
group_by(
start_year,
start_month,
# start_quater
) %>%
summarise(
mean_award_value = mean(award_value),
total_award_value = sum(award_value)
)
## `summarise()` has grouped output by 'start_year'. You can override using the `.groups` argument.
amount_start_year_month
amount_start_year %>%
ggplot(
mapping = aes(
x = start_year,
y = mean_award_value,
fill = start_year
)
) +
geom_col(show.legend = FALSE) +
scale_y_continuous(
name = "Mean Award Value",
labels = label_dollar(prefix = "£", suffix = ""),
breaks = extended_breaks(8),
expand = expansion(mult = c(0, .05))
) +
scale_x_discrete(
name = "Start Year"
) +
scale_fill_discrete()
amount_start_month %>%
ggplot(
mapping = aes(
x = start_month,
y = mean_award_value,
group = start_month,
# fill = hue_pal()(12)
fill = start_month
)
) +
geom_col(show.legend = FALSE) +
scale_y_continuous(
name = "Mean Award Value",
labels = label_dollar(prefix = "£", suffix = ""),
breaks = extended_breaks(8),
expand = expansion(mult = c(0, .05))
) +
scale_x_discrete(
name = "Start Month"
) +
scale_fill_discrete()
amount_start_year_month %>%
ggplot() +
geom_line(
mapping = aes(
x = start_year,
y = mean_award_value,
group = start_month,
colour = start_month
),
size = 1
) +
scale_y_continuous(
name = "Mean Award Value",
labels = label_dollar(prefix = "£", suffix = ""),
# breaks = extended_breaks(8),
expand = expansion(mult = c(0, .05))
) +
scale_x_discrete(
name = "Start Year"
) +
scale_color_discrete(
name = "Start Month"
) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5)
)
amount_start_year %>%
ggplot(
mapping = aes(
x = start_year,
y = total_award_value,
fill = start_year
)
) +
geom_col(show.legend = FALSE) +
scale_y_continuous(
name = "Total Award Value",
labels = label_dollar(prefix = "£", suffix = ""),
breaks = extended_breaks(8),
expand = expansion(mult = c(0, .05))
) +
scale_x_discrete(
name = "Start Year"
)
amount_start_month %>%
ggplot(
mapping = aes(
x = start_month,
y = total_award_value,
group = start_month,
fill = start_month
)
) +
geom_col(show.legend = FALSE) +
scale_y_continuous(
name = "Total Award Value",
labels = label_dollar(prefix = "£", suffix = ""),
breaks = extended_breaks(8),
expand = expansion(mult = c(0, .05))
) +
scale_x_discrete(
name = "Start Month"
) +
scale_fill_discrete()
amount_start_year_month %>%
ggplot(
mapping = aes(
x = start_year,
y = total_award_value,
group = start_month,
col = start_month
)
) +
geom_line(size = 1) +
scale_y_continuous(
name = "Total Award Value",
labels = label_dollar(prefix = "£", suffix = ""),
breaks = extended_breaks(8),
expand = expansion(mult = c(0, .05))
) +
scale_x_discrete(
name = "Start Year",
) +
scale_color_discrete(
name = "Start Month"
) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5)
)
## Distribtuion of Award Value
amount_start_date %>%
mutate(
start_year = as_factor(year(start_date))
) %>%
group_by(start_year) %>%
# filter(award_value != 0) %>%
ggplot() +
geom_boxplot(
mapping = aes(
x = start_year,
y = award_value,
group = start_year,
colour = start_year
),
# outlier.shape = NA,
) +
scale_y_continuous(
name = "Award Value",
labels = label_dollar(prefix = "£"),
trans = pseudo_log_trans(base = 10),
breaks = c(0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8),
expand = expansion(mult = c(0, .05))
) +
theme(
legend.position = "none"
) +
labs(
title = "Yearly Distribution of Award Value - Zero Award Values excluded"
)
amount_start_date %>%
mutate(
start_year = as_factor(year(start_date))
) %>%
group_by(start_year) %>%
filter(award_value != 0) %>%
ggplot() +
geom_boxplot(
mapping = aes(
x = start_year,
y = award_value,
group = start_year,
colour = start_year
),
# outlier.shape = NA,
) +
scale_y_continuous(
name = "Award Value",
labels = label_dollar(prefix = "£"),
trans = pseudo_log_trans(base = 10),
breaks = c(0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8),
expand = expansion(mult = c(0, .05))
) +
theme(
legend.position = "none"
) +
scale_x_discrete(
name = "Start Year"
) +
labs(
title = "Yearly Distribution of Award Value - Zero Award Values excluded"
)
datasets$grants %>%
ggplot() +
geom_histogram(
aes(x = award_value),
bins = 75,
fill = hue_pal()(1)
) +
scale_x_continuous(
name = "Award Value",
labels = label_dollar(prefix = "£"),
trans = pseudo_log_trans(base = 10),
breaks = c(0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8),
minor_breaks = break_log_cust(0:8),
expand = expansion(mult = c(0, .05))
) +
scale_y_continuous(
name = "Count",
breaks_extended(10),
expand = expansion(mult = c(0, .05))
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
datasets$grants %>%
filter(award_value != 0) %>%
ggplot() +
geom_histogram(
mapping = aes(
x = award_value
),
bins = 75,
fill = hue_pal()(1)
) +
scale_x_log10(
name = "Award Value",
labels = label_dollar(prefix = "£"),
breaks = breaks_log(6),
minor_breaks = break_log_cust(3:8),
expand = expansion(mult = c(0, .05))
) +
scale_y_continuous(
name = "Count",
breaks_extended(10),
expand = expansion(mult = c(0, .05))
) +
theme(
legend.position = "none"
)
# Lead Research Organisations and Grant Funding
org_roles_joined <- datasets$organisation_roles %>% left_join(datasets$organisations, by = "org_id")
grants_x_orgs <- datasets$grants %>% left_join(org_roles_joined, by = "grant_ref")
org_funding <- grants_x_orgs %>%
select(
grant_ref,
award_value,
org_role,
org_name
)
org_funding
summary_org <- org_funding %>%
group_by(org_name, org_role) %>%
summarise(
total_award_amount = sum(award_value),
mean_award_amount = mean(award_value),
number_of_grants = n()
)
## `summarise()` has grouped output by 'org_name'. You can override using the `.groups` argument.
summary_org
summary_org %>%
filter(org_role == "LEAD_RO") %>%
ggplot() +
geom_density(
aes(
x = total_award_amount,
..count..
),
# stat = "bin",
# bins = 40,
col = hue_pal()(3)[1],
# fill = hue_pal()(1)
size = 0.75
) +
scale_x_log10(
name = "Total Amount Awarded",
labels = label_dollar(prefix = "£"),
breaks = breaks_log(6),
expand = expansion(mult = c(0, .05))
) +
scale_y_continuous(
name = "Count",
breaks_extended(10),
expand = expansion(mult = c(0, .05))
) +
labs(
title = "Distribution of Total Amount Awarded to Lead Reasearch Organisations"
)
summary_org %>%
filter(org_role == "LEAD_RO") %>%
ggplot() +
geom_density(
aes(
x = mean_award_amount,
..count..
),
# stat = "bin",
# bins = 40,
col = hue_pal()(3)[2],
# fill = hue_pal()(1)
size = 0.75
) +
scale_x_log10(
name = "Mean Amount Awarded",
labels = label_dollar(prefix = "£"),
minor_breaks = break_log_cust(3:7),
expand = expansion(mult = c(0, .05))
) +
scale_y_continuous(
name = "Count",
breaks_extended(10),
expand = expansion(mult = c(0, .05))
) +
labs(
title = "Distribution of Mean Amount Awarded to Lead Reasearch Organisations"
)
summary_org %>%
filter(org_role == "LEAD_RO") %>%
ggplot() +
geom_density(
aes(
x = number_of_grants,
..count..
),
# stat = "bin",
# bins = 40,
col = hue_pal()(3)[3],
# fill = hue_pal()(1)
size = 0.75
) +
scale_x_log10(
name = "Total Number of Grants Awarded",
minor_breaks = break_log_cust(0:3),
expand = expansion(mult = c(0, .05))
) +
# scale_x_continuous(
# name = "Total Number of Grants Awarded",
# breaks = breaks_extended(10),
# ) +
scale_y_continuous(
name = "Count",
breaks_extended(10),
expand = expansion(mult = c(0, .05))
) +
labs(
title = "Distribution of Total Number of Grants Awarded to Lead Reasearch Organisations"
)
# Which Lead Research Organisations receive the most funding?
summary_org %>%
filter(org_role == "LEAD_RO") %>%
arrange(desc(total_award_amount)) %>%
head(15) %>%
ggplot() +
geom_col(
aes(
x = total_award_amount,
y = reorder(org_name, total_award_amount),
),
fill = hue_pal()(3)[1]
) +
scale_x_continuous(
name = "Total Award Amount",
labels = label_dollar(prefix = "£"),
breaks = breaks_extended(),
minor_breaks = breaks_extended(10),
expand = expansion(mult = c(0, .01))
) +
scale_y_discrete(
name = ""
) +
theme(
# plot.margin = unit(c(0.5,1.5,0.5,0), "cm")
) +
labs(
title = "Top 15 Lead Reasearch Organisations by Total Amount Awarded"
)
summary_org %>%
filter(org_role == "LEAD_RO") %>%
arrange(desc(mean_award_amount)) %>%
head(15) %>%
ggplot() +
geom_col(
aes(
x = mean_award_amount,
y = reorder(org_name, mean_award_amount),
),
fill = hue_pal()(3)[2]
) +
# scale_x_continuous(
# name = "Mean Award Amount",
# labels = label_dollar(prefix = "£"),
# expand = expansion(mult = c(0, .01)),
# ) +
scale_x_log10(
name = "Mean Award Amount",
labels = label_dollar(prefix = "£"),
expand = expansion(mult = c(0, .01)),
minor_breaks = break_log_cust(0:7)
) +
scale_y_discrete(
name = ""
) +
theme(
# plot.margin = unit(c(0.5,1.5,0.5,0), "cm")
) +
labs(
title = "Top 15 Lead Reasearch Organisations by Mean \nAmount Awarded"
)
summary_org %>%
filter(org_role == "LEAD_RO") %>%
arrange(desc(number_of_grants)) %>%
head(15) %>%
ggplot() +
geom_col(
aes(
x = number_of_grants,
y = reorder(org_name, number_of_grants),
),
fill = hue_pal()(3)[1]
) +
scale_x_continuous(
name = "Count of Grants Awarded",
expand = expansion(mult = c(0, .05)),
minor_breaks = breaks_extended(25)
) +
scale_y_discrete(
name = ""
) +
labs(
title = "Top 15 Lead Reasearch Organisations by Total Number of \nGrants Awarded"
)
# Grant Subjects and Topics
grants_x_subjects <- datasets$grants %>%
left_join(datasets$subject_weight, by = "grant_ref") %>%
left_join(datasets$subjects, by = "subject_id") %>%
select(
grant_ref,
subject,
percentage,
award_value
) %>%
replace_na(
replace = list(
percentage = 100
)
) %>%
mutate(
percentage = percentage/100,
weighted_award_value = percentage * award_value
)
grants_x_subjects_summary <- grants_x_subjects %>%
group_by(subject) %>%
summarise(
total_weighted_award_value = sum(weighted_award_value),
mean_percentage = mean(percentage),
mean_weighted_award_value = mean(weighted_award_value),
total_number_of_grants = n()
)
grants_x_subjects_summary
grants_x_topics <- datasets$grants %>%
left_join(datasets$topic_weight, by = "grant_ref") %>%
left_join(datasets$topics, by = "topic_id") %>%
select(
grant_ref,
topic,
percentage,
award_value
) %>%
replace_na(
replace = list(
topic = "Unclassified",
percentage = 100
)
) %>%
mutate(
percentage = percentage/100,
weighted_award_value = percentage * award_value
)
grants_x_topics_summary <- grants_x_topics %>%
group_by(topic) %>%
summarise(
total_weighted_award_value = sum(weighted_award_value),
mean_percentage = mean(percentage),
mean_weighted_award_value = mean(weighted_award_value),
total_number_of_grants = n()
)
grants_x_topics_summary
grants_x_subjects_summary %>%
ggplot() +
geom_density(
aes(
x = total_weighted_award_value,
col = "total_weighted_award_value",
..count..
),
# col = hue_pal()(3)[1],
size = 0.75
) +
geom_density(
aes(
x = mean_weighted_award_value,
col = "mean_weighted_award_value",
..count..
),
# col = hue_pal()(3)[2],
size = 0.75
) +
scale_x_continuous(
name = "Weighted Amount Awarded",
labels = label_dollar(prefix = "£"),
trans = pseudo_log_trans(base = 10),
breaks = c(0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9),
minor_breaks = break_log_cust(0:9),
expand = expansion(mult = c(0, .05))
) +
scale_y_continuous(
name = "Count",
breaks_extended(10),
expand = expansion(mult = c(0, .05))
) +
scale_color_discrete(
name = NULL,
labels = c("Mean", "Total")
) +
labs(
title = "Distribution of Weighted Amount Awarded to Each Subject"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.justification=c(1,1),
legend.position=c(1,1)
)
grants_x_subjects_summary %>%
ggplot() +
geom_density(
aes(
x = total_number_of_grants,
..count..
),
col = hue_pal()(3)[3],
size = 0.75
) +
scale_x_continuous(
name = "Number of Grants",
labels = number,
trans = pseudo_log_trans(base = 10),
breaks = c(0, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9),
minor_breaks = break_log_cust(0:9),
expand = expansion(mult = c(0, .05))
) +
scale_y_continuous(
name = "Count",
breaks_extended(10),
expand = expansion(mult = c(0, .05))
) +
labs(
title = "Distribution of Total Number of Grants Under Each Subject",
subtitle = "Including grants with a subject of NA"
)
grants_x_topics_summary %>%
ggplot() +
geom_density(
aes(
x = total_weighted_award_value,
col = "total_weighted_award_value",
..count..
),
# col = hue_pal()(3)[1],
size = 0.75
) +
geom_density(
aes(
x = mean_weighted_award_value,
col = "mean_weighted_award_value",
..count..
),
# col = hue_pal()(3)[2],
size = 0.75
) +
scale_x_continuous(
name = "Weighted Amount Awarded",
labels = label_dollar(prefix = "£"),
trans = pseudo_log_trans(base = 10),
breaks = c(0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9),
minor_breaks = break_log_cust(0:9),
expand = expansion(mult = c(0, .05))
) +
scale_y_continuous(
name = "Count",
breaks_extended(10),
expand = expansion(mult = c(0, .05))
) +
scale_color_discrete(
name = NULL,
labels = c("Mean", "Total")
) +
labs(
title = "Distribution of Weighted Amount Awarded to Each Topic"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.justification=c(1,1),
legend.position=c(1,1)
)
grants_x_topics_summary %>%
ggplot() +
geom_density(
aes(
x = total_number_of_grants,
..count..
),
col = hue_pal()(3)[3],
size = 0.75
) +
scale_x_continuous(
name = "Number of Grants",
labels = number,
trans = pseudo_log_trans(base = 10),
breaks = c(0, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9),
minor_breaks = break_log_cust(0:9),
expand = expansion(mult = c(0, .05))
) +
scale_y_continuous(
name = "Count",
breaks_extended(10),
expand = expansion(mult = c(0, .05))
) +
labs(
title = "Distribution of Total Number of Grants Under Each topic",
subtitle = "Including grants with a topic of NA"
)
grants_x_subjects_summary %>%
filter(!is.na(subject)) %>%
arrange(desc(total_weighted_award_value)) %>%
head(10) %>%
ggplot() +
geom_col(
aes(
x = total_weighted_award_value,
y = reorder(subject, total_weighted_award_value),
),
fill = hue_pal()(3)[1]
) +
# scale_x_log10(
# name = "Total Weighted Award Amount",
# labels = label_dollar(prefix = "£"),
# expand = expansion(mult = c(0, .01)),
# minor_breaks = break_log_cust(0:9)
# ) +
scale_x_continuous(
name = "Total Weighted Award Amount",
labels = label_dollar(prefix = "£"),
breaks = breaks_extended(),
minor_breaks = breaks_extended(20),
expand = expansion(mult = c(0, .01))
) +
scale_y_discrete(
name = ""
) +
labs(
title = "Top 10 Subjects by Total Weighted Amount Awarded",
subtitle = "NA not included"
)
grants_x_subjects_summary %>%
filter(!is.na(subject)) %>%
arrange(desc(mean_weighted_award_value)) %>%
head(10) %>%
ggplot() +
geom_col(
aes(
x = mean_weighted_award_value,
y = reorder(subject, mean_weighted_award_value),
),
fill = hue_pal()(3)[2]
) +
# scale_x_log10(
# name = "Total Weighted Award Amount",
# labels = label_dollar(prefix = "£"),
# expand = expansion(mult = c(0, .01)),
# minor_breaks = break_log_cust(0:9)
# ) +
scale_x_continuous(
name = "Mean Weighted Award Amount",
labels = label_dollar(prefix = "£"),
breaks = breaks_extended(),
minor_breaks = breaks_extended(20),
expand = expansion(mult = c(0, .01))
) +
scale_y_discrete(
name = ""
) +
labs(
title = "Top 10 Subjects by Mean Weighted Amount Awarded",
subtitle = "NA not included"
)
grants_x_subjects_summary %>%
filter(!is.na(subject)) %>%
arrange(desc(total_number_of_grants)) %>%
head(10) %>%
ggplot() +
geom_col(
aes(
x = total_number_of_grants,
y = reorder(subject, total_number_of_grants),
),
fill = hue_pal()(3)[3]
) +
# scale_x_log10(
# name = "Total Weighted Award Amount",
# labels = label_dollar(prefix = "£"),
# expand = expansion(mult = c(0, .01)),
# minor_breaks = break_log_cust(0:9)
# ) +
scale_x_continuous(
name = "Total Number of Grants",
breaks = breaks_extended(),
minor_breaks = breaks_extended(20),
expand = expansion(mult = c(0, .01))
) +
scale_y_discrete(
name = ""
) +
labs(
title = "Top 10 Subjects by Total Number of Grants",
subtitle = "NA not included"
)
### Topic
grants_x_topics_summary %>%
filter(topic != "Unclassified") %>%
arrange(desc(total_weighted_award_value)) %>%
head(10) %>%
ggplot() +
geom_col(
aes(
x = total_weighted_award_value,
y = reorder(topic, total_weighted_award_value),
),
fill = hue_pal()(3)[1]
) +
# scale_x_log10(
# name = "Total Weighted Award Amount",
# labels = label_dollar(prefix = "£"),
# expand = expansion(mult = c(0, .01)),
# minor_breaks = break_log_cust(0:9)
# ) +
scale_x_continuous(
name = "Total Weighted Award Amount",
labels = label_dollar(prefix = "£"),
breaks = breaks_extended(),
minor_breaks = breaks_extended(20),
expand = expansion(mult = c(0, .01))
) +
scale_y_discrete(
name = ""
) +
labs(
title = "Top 10 Topics by Total Weighted Amount Awarded",
subtitle = "Unclassified topics not included"
)
grants_x_topics_summary %>%
filter(topic != "Unclassified") %>%
arrange(desc(mean_weighted_award_value)) %>%
head(10) %>%
ggplot() +
geom_col(
aes(
x = mean_weighted_award_value,
y = reorder(topic, mean_weighted_award_value),
),
fill = hue_pal()(3)[2]
) +
# scale_x_log10(
# name = "Total Weighted Award Amount",
# labels = label_dollar(prefix = "£"),
# expand = expansion(mult = c(0, .01)),
# minor_breaks = break_log_cust(0:9)
# ) +
scale_x_continuous(
name = "Mean Weighted Award Amount",
labels = label_dollar(prefix = "£"),
breaks = breaks_extended(),
minor_breaks = breaks_extended(20),
expand = expansion(mult = c(0, .01))
) +
scale_y_discrete(
name = ""
) +
labs(
title = "Top 10 Topics by Mean Weighted Amount Awarded",
subtitle = "Unclassified topics not included"
)
grants_x_topics_summary %>%
filter(topic != "Unclassified") %>%
arrange(desc(total_number_of_grants)) %>%
head(10) %>%
ggplot() +
geom_col(
aes(
x = total_number_of_grants,
y = reorder(topic, total_number_of_grants),
),
fill = hue_pal()(3)[3]
) +
# scale_x_log10(
# name = "Total Weighted Award Amount",
# labels = label_dollar(prefix = "£"),
# expand = expansion(mult = c(0, .01)),
# minor_breaks = break_log_cust(0:9)
# ) +
scale_x_continuous(
name = "Total Number of Grants",
breaks = breaks_extended(),
minor_breaks = breaks_extended(20),
expand = expansion(mult = c(0, .01))
) +
scale_y_discrete(
name = ""
) +
labs(
title = "Top 10 Topics by Total Number of Grants",
subtitle = "Unclassified topics not included"
)
grants_x_person_roles <- left_join(datasets$grants, datasets$person_roles, by = "grant_ref") %>%
select(
person_id,
award_value,
grant_category,
start_date,
role
) %>%
filter(
grant_category != "Studentship"
) %>%
group_by(person_id) %>%
mutate(
grant_number_all = order(order(start_date))
) %>%
group_by(person_id, grant_category) %>%
mutate(
grant_number_category = order(order(start_date))
) %>%
group_by(person_id, role) %>%
mutate(
grant_number_role = order(order(start_date))
) %>%
group_by(person_id, role, grant_category) %>%
mutate(
grant_number_category_and_role = order(order(start_date))
) %>%
ungroup()
grants_x_person_roles
# grants_x_person_roles %>%
# group_by(
# grant_number_all
# ) %>%
# summarise(
# mean_award_value = mean(award_value)
# ) %>%
# ggplot(
# mapping = aes(
# x = grant_number_all,
# y = mean_award_value/1000
# )
# ) +
# geom_line() +
# scale_y_continuous(
# name = "Mean Award Value (£k)",
# labels = number,
# breaks = breaks_extended(n = 10)
# ) +
# scale_x_continuous(
# name = "Number of Grants completed by Person",
# breaks = breaks_extended(n = 10),
# minor_breaks = breaks_width(1)
# )
grants_x_person_roles %>%
group_by(
grant_number_all
) %>%
mutate(
mean_award_value = mean(award_value)
) %>%
ungroup() %>%
ggplot(
mapping = aes(
x = grant_number_all,
y = value,
)
) +
geom_smooth(aes(y = award_value, col = "Award Value")) +
geom_line(aes(y = mean_award_value, col = "Mean Award Value")) +
scale_y_continuous(
name = "Award Value",
labels = label_dollar(prefix = "£"),
breaks = breaks_extended(n = 10),
expand = expansion(mult = c(.01, .01))
) +
scale_x_continuous(
name = "Number of Grants Completed by Person",
breaks = breaks_extended(n = 10),
minor_breaks = breaks_width(1),
expand = expansion(mult = c(0, .01))
) +
theme(legend.title = element_blank()) +
labs(
title = "Change in the Award Value as Number of Grants Awarded to a Person \nIncreases",
# subtitle = "Unclassified topics not included"
)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
grants_x_person_roles %>%
ggplot() +
geom_histogram(
mapping = aes(
grant_number_all,
fill = "Number of Grants",
),
bins = 40,
show.legend = FALSE
) +
# scale_x_log10(
# name = "Total Weighted Award Amount",
# labels = label_dollar(prefix = "£"),
# expand = expansion(mult = c(0, .01)),
# minor_breaks = break_log_cust(0:9)
# ) +
scale_x_continuous(
name = "Number of Grants Completed by Person",
breaks = breaks_extended(),
minor_breaks = breaks_extended(40),
expand = expansion(mult = c(0, .01))
) +
scale_y_continuous(
name = "Count",
expand = expansion(mult = c(0, .05))
) +
labs(
title = "Distribution of the Number of Grants Awarded to a Person",
# subtitle = "Unclassified topics not included"
)
grants_x_person_roles %>%
mutate(
role = fct_recode(
fct_lump_n(fct_explicit_na(role), n = 3),
"Co-Invesigator" = "CO_INVESTIGATOR",
"Principle Invesigator" = "PRINCIPAL_INVESTIGATOR",
"Training Grant Holder" = "TRAINING_GRANT_HOLDER",
"Other Roles" = "Other"
)
) %>%
ggplot() +
geom_smooth(
aes(
x = grant_number_role,
y = award_value,
fill = role,
col = role,
),
show.legend = FALSE
) +
facet_wrap(
facets = vars(role),
scales = "free"
) +
scale_y_continuous(
name = "Award Value",
labels = label_dollar(prefix = "£")
) +
scale_x_continuous(
name = "Number of Grants Completed by Person in that Role",
) +
labs(
title = "Change in the Award Value as Number of Grants Awarded to a Person \nIncreases",
subtitle = "Grouped by Person Role"
)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
grants_x_person_roles %>%
ggplot() +
geom_smooth(
aes(
x = grant_number_role,
y = award_value,
fill = grant_category,
col = grant_category,
),
show.legend = FALSE
) +
facet_wrap(
facets = vars(grant_category),
scales = "free"
) +
scale_y_continuous(
name = "Award Value",
labels = label_dollar(prefix = "£")
) +
scale_x_continuous(
name = "Number of Grants Completed by Person in that Role",
) +
labs(
title = "Change in the Award Value as Number of Grants Awarded to a Person \nIncreases",
subtitle = "Grouped by Grant Type"
)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'